In this part, we will visualize the pathways from Over-representation Analysis using heatmap.
knitr::opts_chunk$set(
warning = FALSE, message = FALSE
)
#install.packages("BiocManager")
#library(BiocManager)
#BiocManager::install("DESeq2")
#BiocManager::install("apeglm")
#BiocManager::install("tidyverse") # includes ggplot2 and dplyr
#BiocManager::install("EnhancedVolcano")
#install.packages("plotly")
#BiocManager::install("ComplexHeatmap") #alternative = pheatmap
#BiocManager::install( "AnnotationDbi" )
#install.packages("devtools")
#install.packages("Rtools")
#BiocManager::install("EnsDb.Hsapiens.v86")
# BiocManager::install( "clusterProfiler" )
# BiocManager::install( "enrichplot" )
# BiocManager::install("DOSE")
#BiocManager::install("ReactomePA", force = TRUE)
#BiocManager::install("msigdbr", force = TRUE)
#BiocManager::install("KEGGREST", force = TRUE)
# BiocManager::install("BiocUpgrade") ## you may need this
# BiocManager::install("GOSemSim", force = TRUE)
# install.packages("remotes")
# library(devtools)
# devtools::install_github("GuangchuangYu/GOSemSim")
#install.packages(c("ggraph", "ggnetwork")) #altering ggplots
#devtools::install_github("datapplab/pathview")
#update.packages(c("lattice", "spatial"))
#install.packages("igraph")
#library(DESeq2)
library(AnnotationDbi)
library(devtools)
library(tidyverse) # includes ggplot2, for data visualisation. dplyr, for data manipulation.
library(clusterProfiler) # for PEA analysis
library(org.Hs.eg.db)
library(EnsDb.Hsapiens.v86)
library(DOSE)
library(enrichplot) # for visualisations
library(ggupset) # for visualisations
library(ReactomePA)
library(msigdbr)
library(KEGGREST)
library(httr2)
library(DBI)
library(GOSemSim)
library(lattice)
library(spatial)
library(pathview)
library(cowplot)
library(ggridges)
library(ComplexHeatmap)
library(circlize)
library(igraph)
#Set directory, input and output path
getwd()
## [1] "/home/sant/Downloads/Data Science-bioinformatics learning/Bulk-RNA-seq-analysis-DESeq2/Part 2- Pathway analysis"
in_path <- paste0(getwd(),"/Input/") # input path, where your input data is located
GO_BP <- paste0(getwd(),"/ORA/GO/") # Path for counts table
out_heat_GO <- paste0(getwd(),"/ORA-heatmap/GO/") # output path for heatmap results
####2a-Convert data to gene symbols and create heatmap with gene symbols
#load counts table, Ensgene names as rownames
norm_counts <- read.csv(file= paste0(in_path,"norm_counts.csv")) %>% column_to_rownames(var = "X")
#import the top10 combined pathways
BP_comb_top10 <- read.csv(file = paste0(GO_BP,"BP_top10_correctfinal.csv"))
#create functions to extract data from the enrichGO object, create a list of ensembl genes, and
slice_head(BP_comb_top10, n=20)
#Extract ORA list from results data frame
extract_ORA <- function(x) { #Function to extract geneID and description from enrichGO as a list
y <- x$geneID #extract geneIds from enrichGO results
z <- x$Description #extract Description from enrichGO results
return(list(geneIDs = y, Description = z))
}
#Split gene names from
Split_ORA_l <- function(y,z){# Function to separate the gene names in the list, input geneList and Description
x <- lapply(seq_along(y), function(i){ # y is the geneList from 1st step
unlist(strsplit(y[i], "/")) #split and return the geneIds as a vector
})
names(x) <- z #name of each pathway/list
return(x) #return the function as a list
}
#Convert ORA list from ENTREZ to gene symbols
Sym_ORA <- function(x) { #list of split genes
y <- lapply(x, function(i){ # create a list
z <- mapIds(org.Hs.eg.db, keys = i, keytype = "ENTREZID", column = "SYMBOL") #Use annotationDbi to return the ENSEMBL Ids from Split list
return(z[!is.na(z)])
})
return(y)
}
#Annotate Counts table with gene symbol, remove rownames then convert symbol col to rownames
Counts_S <- norm_counts
Counts_S$symbol <- mapIds(org.Hs.eg.db, keys = rownames(norm_counts), keytype = "ENSEMBL", column = "SYMBOL")
rownames(Counts_S) <- NULL
Counts_S <- Counts_S[!duplicated(Counts_S$symbol),] %>% na.omit(Counts_S)
rownames(Counts_S) <- NULL
Counts_S <- Counts_S %>% column_to_rownames(var = "symbol")
slice_head(Counts_S, n=20)
#Create gene list for heatmap for each pathway by matching ORA list with counts table
ORA_hm_l <- function(x,y) {#x is the ens enrichGO list(from previous step) and y is the counts list
z <- lapply(x, function(i){
k <- y[rownames(y) %in% i,] #k is the list of common genes between the counts table and ens list
return(k)
})
return(z) # z is the whole list
}
#Heatmap function using gene symbols
ORA_hm_S <- function(x, top_n = 25){
k <- lapply(seq_along(x), function(i){ #whole heatmap function
# print(nrow(x[[i]]))
if (nrow(x[[i]]) > 0) { #Make sure there are genes in each list
# print(sapply(x[[i]], is.numeric))
# Subset to top 20 genes
if (nrow(x[[i]]) > top_n) {
w <- x[[i]][1:top_n, ] # Select top 20 rows
} else {
w <- x[[i]] # If less than 20 rows, use the whole dataframe
}
y <- w[, sapply(w, is.numeric)]
z <- scale(y)
# print("Heatmap is being called")
hm <- Heatmap(z, name = "z-score", #heatmap settings and input
row_labels = rownames(w),
column_labels = colnames(y),
column_title = names(x)[i],
column_title_gp = gpar(fontsize = 16), #Bigger title
cluster_rows = T, cluster_columns = T,
col = colorRamp2(c(min(z), 0, max(z)), c("blue", "white", "red"))) # Save heatmap as PNG
pdf(file = paste0(out_heat_GO, "Symbol_", names(x)[i], ".pdf")) # Adjust width and height as needed
draw(hm)
dev.off()
return(hm)
} else { #make sure there are genes in the list or no problems, otherwise it will have no hm and thus return NULL
return(NULL)
}
})
print(k)
}
#Perform all data extraction and heatmap functions
ORA1 <- extract_ORA(BP_comb_top10) #Extract the data from the combined top 10 pathways
ORA2 <- Split_ORA_l(ORA1$geneIDs, ORA1$Description) # Split each gene into a separate character
ORA3a <- Sym_ORA(ORA2) # Convert gene names to symbol
ORA4a <- ORA_hm_l(ORA3a, Counts_S) # Create the gene list with counts using the genes present in normalized counts table
ORA5a <- ORA_hm_S(ORA4a) # Visualize the heatmap
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
print(ORA5a) #Visualize the heatmap
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
####2b-Heatmap with ensembl IDs
# #load counts table, Ensgene names as rownames
# norm_counts <- read.csv(file= paste0(in_path,"norm_counts.csv")) %>% column_to_rownames(var = "X")
#
# #import the top10 combined pathways
# BP_comb_top10 <- read.csv(file = paste0(GO_BP,"BP_top10_correctfinal.csv"))
#
# #create functions to extract data from the enrichGO object, create a list of ensembl genes, and
#
# slice_head(BP_comb_top10, n=10)
#
# #Extract ORA list from results data frame
# extract_ORA <- function(x) { #Function to extract geneID and description from enrichGO as a list
# y <- x$geneID #extract geneIds from enrichGO results
# z <- x$Description #extract Description from enrichGO results
# return(list(geneIDs = y, Description = z))
# }
#
# #Split gene names from
# Split_ORA_l <- function(y,z){# Function to separate the gene names in the list, input geneList and Description
# x <- lapply(seq_along(y), function(i){ # y is the geneList from 1st step
# unlist(strsplit(y[i], "/")) #split and return the geneIds as a vector
# })
# names(x) <- z #name of each pathway/list
# return(x) #return the function as a list
# }
#
# #Convert ORA list from ENTREZ to ENSEMBL ID
# Ens_ORA <- function(x) { #list of split genes
# y <- lapply(x, function(i){ # create a list
# z <- mapIds(org.Hs.eg.db, keys = i, keytype = "ENTREZID", column = "ENSEMBL") #Use annotationDbi to return the ENSEMBL Ids from Split list
# return(z[!is.na(z)])
# })
# return(y)
# }
#
# #Create gene list for heatmap for each pathway by matching ORA list with counts table
# ORA_hm_l <- function(x,y) {#x is the ens enrichGO list(from previous step) and y is the counts list
# z <- lapply(x, function(i){
# k <- y[rownames(y) %in% i,] #k is the list of common genes between the counts table and ens list
# return(k)
# })
# return(z) # z is the whole list
# }
#
# #Function to create heatmap for all pathways.. Loop through each pathway and save as a new file
# ORA_hm <- function(x, top_n = 20){
# k <- lapply(seq_along(x), function(i){ #whole heatmap function
# if (nrow(x[[i]]) > 0) { #Make sure there are genes in each list
# print(sapply(x[[i]], is.numeric))
# print(nrow(x[[i]]))
# # Subset to top 20 genes
# if (nrow(x[[i]]) > top_n) {
# w <- x[[i]][1:top_n, ] # Select top 20 rows
# } else {
# w <- x[[i]] # If less than 20 rows, use the whole dataframe
# }
#
# y <- w[, sapply(w, is.numeric)]
# z <- scale(y)
# print("Heatmap is being called")
# hm <- Heatmap(z, name = "z-score", #heatmap settings and input
# row_labels = rownames(w),
# column_labels = colnames(y),
# column_title = names(x)[i],
# column_title_gp = gpar(fontsize = 14), #Bigger title
# cluster_rows = T, cluster_columns = T,
# col = colorRamp2(c(min(z), 0, max(z)), c("blue", "white", "red"))) # Save heatmap as PNG
# pdf(file = paste0(out_heat_GO, paste0(names(x)[i], ".pdf"))) # Save each file as a pathway name
# draw(hm)
# dev.off()
# return(hm)
# } else { #make sure there are genes in the list or no problems, otherwise it will have no hm and thus return NULL
# return(NULL)
# }
# })
# print(k)
# }
#
#
# #Perform data extraction and heatmap
# ORA1 <- extract_ORA(BP_comb_top10) #Extract the data from the combined top 10 pathways
# ORA2 <- Split_ORA_l(ORA1$geneIDs, ORA1$Description) # Split each gene into a separate character
# ORA3 <- Ens_ORA(ORA2) # Convert gene names to ensembl ID
# ORA4 <- ORA_hm_l(ORA3, norm_counts) # Create the gene list with counts using the genes present in normalized counts table
# ORA5 <- ORA_hm(ORA4) # Visualize the heatmap
sessionInfo()
## R version 4.4.3 (2025-02-28)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/Chicago
## tzcode source: system (glibc)
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] igraph_2.1.4 circlize_0.4.16
## [3] ComplexHeatmap_2.22.0 ggridges_0.5.6
## [5] cowplot_1.1.3 pathview_1.43.1
## [7] spatial_7.3-17 lattice_0.22-5
## [9] GOSemSim_2.33.0 DBI_1.2.3
## [11] httr2_1.1.0 KEGGREST_1.46.0
## [13] msigdbr_7.5.1 ReactomePA_1.50.0
## [15] ggupset_0.4.1 enrichplot_1.26.6
## [17] DOSE_4.0.0 EnsDb.Hsapiens.v86_2.99.0
## [19] ensembldb_2.30.0 AnnotationFilter_1.30.0
## [21] GenomicFeatures_1.58.0 GenomicRanges_1.58.0
## [23] GenomeInfoDb_1.42.3 org.Hs.eg.db_3.20.0
## [25] clusterProfiler_4.14.4 lubridate_1.9.4
## [27] forcats_1.0.0 stringr_1.5.1
## [29] dplyr_1.1.4 purrr_1.0.4
## [31] readr_2.1.5 tidyr_1.3.1
## [33] tibble_3.2.1 ggplot2_3.5.1
## [35] tidyverse_2.0.0 devtools_2.4.5
## [37] usethis_3.1.0 AnnotationDbi_1.68.0
## [39] IRanges_2.40.1 S4Vectors_0.44.0
## [41] Biobase_2.66.0 BiocGenerics_0.52.0
##
## loaded via a namespace (and not attached):
## [1] splines_4.4.3 later_1.4.1
## [3] BiocIO_1.16.0 bitops_1.0-9
## [5] ggplotify_0.1.2 R.oo_1.27.0
## [7] polyclip_1.10-7 graph_1.84.1
## [9] XML_3.99-0.18 lifecycle_1.0.4
## [11] doParallel_1.0.17 MASS_7.3-64
## [13] magrittr_2.0.3 sass_0.4.9
## [15] rmarkdown_2.29 jquerylib_0.1.4
## [17] yaml_2.3.10 remotes_2.5.0
## [19] httpuv_1.6.15 ggtangle_0.0.6
## [21] sessioninfo_1.2.3 pkgbuild_1.4.6
## [23] RColorBrewer_1.1-3 abind_1.4-8
## [25] pkgload_1.4.0 zlibbioc_1.52.0
## [27] R.utils_2.12.3 ggraph_2.2.1
## [29] RCurl_1.98-1.16 yulab.utils_0.2.0
## [31] rappdirs_0.3.3 tweenr_2.0.3
## [33] GenomeInfoDbData_1.2.13 ggrepel_0.9.6
## [35] tidytree_0.4.6 reactome.db_1.89.0
## [37] codetools_0.2-20 DelayedArray_0.32.0
## [39] ggforce_0.4.2 shape_1.4.6.1
## [41] tidyselect_1.2.1 aplot_0.2.4
## [43] UCSC.utils_1.2.0 farver_2.1.2
## [45] viridis_0.6.5 matrixStats_1.5.0
## [47] GenomicAlignments_1.42.0 jsonlite_1.8.9
## [49] GetoptLong_1.0.5 ellipsis_0.3.2
## [51] tidygraph_1.3.1 iterators_1.0.14
## [53] foreach_1.5.2 tools_4.4.3
## [55] treeio_1.30.0 Rcpp_1.0.14
## [57] glue_1.8.0 gridExtra_2.3
## [59] SparseArray_1.6.1 xfun_0.50
## [61] qvalue_2.38.0 MatrixGenerics_1.18.1
## [63] withr_3.0.2 fastmap_1.2.0
## [65] digest_0.6.37 timechange_0.3.0
## [67] R6_2.6.1 mime_0.12
## [69] gridGraphics_0.5-1 colorspace_2.1-1
## [71] GO.db_3.20.0 RSQLite_2.3.9
## [73] R.methodsS3_1.8.2 generics_0.1.3
## [75] data.table_1.16.4 rtracklayer_1.66.0
## [77] graphlayouts_1.2.2 httr_1.4.7
## [79] htmlwidgets_1.6.4 S4Arrays_1.6.0
## [81] graphite_1.52.0 pkgconfig_2.0.3
## [83] gtable_0.3.6 blob_1.2.4
## [85] XVector_0.46.0 htmltools_0.5.8.1
## [87] profvis_0.4.0 fgsea_1.32.2
## [89] clue_0.3-66 ProtGenerics_1.38.0
## [91] scales_1.3.0 png_0.1-8
## [93] ggfun_0.1.8 knitr_1.49
## [95] rstudioapi_0.17.1 tzdb_0.4.0
## [97] reshape2_1.4.4 rjson_0.2.23
## [99] nlme_3.1-167 curl_6.2.0
## [101] GlobalOptions_0.1.2 cachem_1.1.0
## [103] parallel_4.4.3 miniUI_0.1.1.1
## [105] restfulr_0.0.15 pillar_1.10.1
## [107] vctrs_0.6.5 urlchecker_1.0.1
## [109] promises_1.3.2 cluster_2.1.8
## [111] xtable_1.8-4 Rgraphviz_2.50.0
## [113] KEGGgraph_1.66.0 evaluate_1.0.3
## [115] cli_3.6.4 compiler_4.4.3
## [117] Rsamtools_2.22.0 rlang_1.1.5
## [119] crayon_1.5.3 plyr_1.8.9
## [121] fs_1.6.5 stringi_1.8.4
## [123] viridisLite_0.4.2 BiocParallel_1.40.0
## [125] babelgene_22.9 munsell_0.5.1
## [127] Biostrings_2.74.1 lazyeval_0.2.2
## [129] Matrix_1.7-2 hms_1.1.3
## [131] patchwork_1.3.0 bit64_4.6.0-1
## [133] shiny_1.10.0 SummarizedExperiment_1.36.0
## [135] memoise_2.0.1 bslib_0.9.0
## [137] ggtree_3.14.0 fastmatch_1.1-6
## [139] bit_4.5.0.1 ape_5.8-1
## [141] gson_0.1.0